##
## ==> 2024-04-24 21:02:41 loading required packages ...
## set working directory
outdir <- file.path(workdir, "output", fsep = "/")
gff_file <- file.path(workdir, "../genomes/GCA_000014265.1_ASM1426v1_genomic.gff", fsep = "/")
count_file=paste(workdir, "Tricho_featureCounts_result.tsv", sep="/")
metadata_file=paste(workdir, "metadata.tsv", sep="/")
if (workdir != getwd()) system(paste("mkdir -p", workdir, sep=" "))
setwd(workdir)
if (!dir.exists(outdir)) {
dir.create(outdir)
} else {
cat("\n==>", as.character(Sys.time()), "Output directory ", outdir, "exists, will use it.\n")
}
##
## ==> 2024-04-24 21:02:50 Output directory /Users/shengwei/GitHub/Tricho-Transcriptomics/differential-expression-analysis/output exists, will use it.
prefix ="Tricho"
prefix <- paste(outdir, prefix, sep="/")
# read in gff file
gff <- gffReader(gff_file)
##
## ==> 2024-04-24 21:02:50 reading gff file /Users/shengwei/GitHub/Tricho-Transcriptomics/differential-expression-analysis/../genomes/GCA_000014265.1_ASM1426v1_genomic.gff
##
## ==> 2024-04-24 21:02:50 class of gff is data.frame
##
## ==> 2024-04-24 21:02:50 found 9672 rows with classes: character, character, character, integer, integer, character, character, character, character
##
## ==> 2024-04-24 21:02:50 found 9672 rows after filtration!
#View(gff)
# get gene_gff
gene_gff <- gff[gff$feature =="gene", ]
#fix(gene_gff)
#View(gene_gff)
# only keep features that are not "gene",
otherFeatures <- unique(gff[, 3])
#otherFeatures
otherFeatures <- otherFeatures[-which(otherFeatures %in% c("gene", "region", "exon"))]
#otherFeatures
# get other_gff
other_gff <- data.frame(seqname=character(0), source=character(0), feature=character(0),
start=integer(0), end=integer(0), score=character(0),
strand=character(0), frame=character(0), atrributes=character(0))
for (i in 1:length(otherFeatures)){
#print(otherFeatures[i])
other_gff <- rbind(other_gff, gff[gff$feature == otherFeatures[i], ])
}
#fix(other_gff)
# bind geneAttributesColumn and geneID to other_gff
other_gff <- addGeneAttributesColumn(other_gff, gene_gff)
locus_tag <- getAttributeField(other_gff[, "geneAttributes"], "locus_tag", attrsep = ";")
other_gff <- cbind(other_gff, locus_tag=locus_tag)
#head(other_gff)
#View(other_gff)
# extract product, make annotations
product <- getAttributeField(other_gff[, "attributes"], "product", attrsep = ";")
dbxref <- getAttributeField(other_gff[, "attributes"], "Dbxref", attrsep = ";")
note <- getAttributeField(other_gff[, "attributes"], "Note", attrsep = ";")
annotations <- other_gff[, c("geneID", "locus_tag", "seqname", "feature", "start", "end", "strand", "attributes")]
annotations <- cbind(annotations, product, dbxref, note)
# reorganize the order of product column
annotations <- annotations[, c("geneID", "locus_tag", "feature", "start", "end", "strand", "seqname", "product", "dbxref", "note", "attributes")]
#View(annotations)
#head(annotations)
cat("\n==>", as.character(Sys.time()), "final annotations look like: \n")
##
## ==> 2024-04-24 21:02:52 final annotations look like:
head(annotations, 2)
## geneID locus_tag feature start end strand seqname
## 3 gene0 Tery_0001 CDS 27 1397 + CP000393.1
## 5 gene1 Tery_0002 CDS 1489 1689 - CP000393.1
## product
## 3 chromosomal replication initiator protein DnaA
## 5 putative transposase gene of IS630 family insertion sequence ISY100h
## dbxref
## 3 InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR013159,InterPro:IPR013317,NCBI_GP:ABG49531.1
## 5 NCBI_GP:ABG49532.1
## note
## 3 KEGG: ana:alr2009 chromosomal replication initiator protein~TIGRFAM: chromosomal replication initiator protein DnaA~PFAM: Chromosomal replication initiator%2C DnaA-like Chromosomal replication initiator%2C DnaA~SMART: ATPase
## 5 KEGG: syn:sll0431 putative transposase gene of IS630 family insertion sequence ISY100h
## attributes
## 3 ID=cds0;Parent=gene0;Dbxref=InterPro:IPR001957,InterPro:IPR003593,InterPro:IPR013159,InterPro:IPR013317,NCBI_GP:ABG49531.1;Name=ABG49531.1;Note=KEGG: ana:alr2009 chromosomal replication initiator protein~TIGRFAM: chromosomal replication initiator protein DnaA~PFAM: Chromosomal replication initiator%2C DnaA-like Chromosomal replication initiator%2C DnaA~SMART: ATPase;gbkey=CDS;product=chromosomal replication initiator protein DnaA;protein_id=ABG49531.1;transl_table=11
## 5 ID=cds1;Parent=gene1;Dbxref=NCBI_GP:ABG49532.1;Name=ABG49532.1;Note=KEGG: syn:sll0431 putative transposase gene of IS630 family insertion sequence ISY100h;gbkey=CDS;product=putative transposase gene of IS630 family insertion sequence ISY100h;protein_id=ABG49532.1;transl_table=11
rawCounts <- read.table(file = count_file, header = T, sep = "\t", as.is = T, stringsAsFactors = F)
#fix(rawCounts)
#head(rawCounts)
#ncol(rawCounts)
# get counts table
rawTable <- rawCounts[, 7:ncol(rawCounts)]
rownames(rawTable) <- rawCounts[, 1]
#head(rawTable)
counts <- sapply(rawTable, as.numeric)
rownames(counts) <- rownames(rawTable)
cat("\n==>", as.character(Sys.time()), " raw count table looks like: \n")
##
## ==> 2024-04-24 21:02:52 raw count table looks like:
head(counts, 2)
## HFe_d5_rep1 HFe_d5_rep2 HFe_d5_rep3 LFe_d5_rep1 LFe_d5_rep2
## Tery_0001 328 518 394 303 537
## Tery_0002 20 35 34 17 31
## MFe_d5_rep1 MFe_d5_rep2
## Tery_0001 473 262
## Tery_0002 17 18
# calculate count per million
cpms <- cpm(counts)
#head(cpms)
cat("\n==>", as.character(Sys.time()), " Dimension before filtering: ", dim(cpms), "\n")
##
## ==> 2024-04-24 21:02:52 Dimension before filtering: 4499 7
# keep rows meet requirements: at least 1 column has cpm more than 1, rowSums more than 0.5*ncol
keep <- rowSums(cpms > 1) >=1 & rowSums(cpms) >= ncol(cpms)*0.5
counts <- counts[keep, ]
cat("\n==>", as.character(Sys.time()), " Dimension after filtering weakly expressed genes: ", dim(counts), "\n")
##
## ==> 2024-04-24 21:02:52 Dimension after filtering weakly expressed genes: 4249 7
# remove rows belongs to rRNA and tRNA
#View(other_gff)
#remove <- other_gff[other_gff$feature == "rRNA" | other_gff$feature == "tRNA", "geneID"]
remove <- other_gff[other_gff$feature == "rRNA", "geneID"]
#remove
counts <- counts[!row.names(counts)%in%remove, ]
#head(counts)
#dim(counts)
cat("\n==>", as.character(Sys.time()), " Dimension after removing weakly expressed genes and rRNA/tRNA: ", dim(counts))
##
## ==> 2024-04-24 21:02:52 Dimension after removing weakly expressed genes and rRNA/tRNA: 4249 7
#View(counts)
cat("\n==>", as.character(Sys.time()), " making metatable ...\n")
##
## ==> 2024-04-24 21:02:52 making metatable ...
# build a meta table, giving all experimental factors
#colnames(counts)
# read in metadata
metadata <- read.table(metadata_file, header = TRUE, as.is = TRUE, stringsAsFactors = FALSE, row.names = 1)
if(!all.equal(rownames(metadata), colnames(counts))) {
stop("sample names in metadata didn't match column names in count table!\n")
}
cat("\n==>", as.character(Sys.time()), " metadata looks like: \n")
##
## ==> 2024-04-24 21:02:52 metadata looks like:
head(metadata,7)
## day replicate treatment treatment_day color pch
## HFe_d5_rep1 5 1 HFe HFe_d5 forestgreen 16
## HFe_d5_rep2 5 2 HFe HFe_d5 forestgreen 16
## HFe_d5_rep3 5 3 HFe HFe_d5 forestgreen 16
## LFe_d5_rep1 5 1 LFe LFe_d5 red 16
## LFe_d5_rep2 5 2 LFe LFe_d5 red 16
## MFe_d5_rep1 5 1 MFe MFe_d5 yellow3 16
## MFe_d5_rep2 5 2 MFe MFe_d5 yellow3 16
cat("\n==>", as.character(Sys.time()), " running edgeR TMM normalization ...\n")
##
## ==> 2024-04-24 21:02:52 running edgeR TMM normalization ...
# make DGEList
cat("\n==>", as.character(Sys.time()), " making DEGList with counts and metadata ...\n")
##
## ==> 2024-04-24 21:02:52 making DEGList with counts and metadata ...
d <- DGEList(counts=counts, group=as.factor(metadata$treatment))
#d
cat("\n==>", as.character(Sys.time()), " calculating normalization factors using TMM method ...\n")
##
## ==> 2024-04-24 21:02:52 calculating normalization factors using TMM method ...
d <- calcNormFactors(d, method = "TMM")
#d
cat("\n==>", as.character(Sys.time()), " calculating count per million (cpm) using normalized DGEList object ...\n")
##
## ==> 2024-04-24 21:02:52 calculating count per million (cpm) using normalized DGEList object ...
cpms<- cpm(d, normalized.lib.sizes=TRUE)
#head(cpms)
cat("\n==>", as.character(Sys.time()), " formating cpm counts to scientific format with 2 digits ...\n")
##
## ==> 2024-04-24 21:02:52 formating cpm counts to scientific format with 2 digits ...
cpms.formatted <- format(cpms, digits=2, scientific = F)
colnames(cpms.formatted) <- paste(colnames(cpms.formatted), "_CPM", sep = "")
cat("\n==>", as.character(Sys.time()), " formated cpm looks like this:\n")
##
## ==> 2024-04-24 21:02:52 formated cpm looks like this:
head(cpms.formatted, 2)
## HFe_d5_rep1_CPM HFe_d5_rep2_CPM HFe_d5_rep3_CPM LFe_d5_rep1_CPM
## Tery_0001 " 89.95" " 79.68" " 72.19" " 99.83"
## Tery_0002 " 5.49" " 5.38" " 6.23" " 5.60"
## LFe_d5_rep2_CPM MFe_d5_rep1_CPM MFe_d5_rep2_CPM
## Tery_0001 " 97.53" " 84.73" " 73.85"
## Tery_0002 " 5.63" " 3.05" " 5.07"
# compute log2CPM
cat("\n==>", as.character(Sys.time()), " computing log2 transformed cpms ...\n")
##
## ==> 2024-04-24 21:02:52 computing log2 transformed cpms ...
cpms.log <- log2(cpms+0.01)
cpms.log.formatted <- format(cpms.log, digits=2, scientific = F)
colnames(cpms.log.formatted) <- paste(colnames(cpms), "_logCPM", sep = "")
cat("\n==> formated logCPM looks like this:\n")
##
## ==> formated logCPM looks like this:
head(cpms.log.formatted, 2)
## HFe_d5_rep1_logCPM HFe_d5_rep2_logCPM HFe_d5_rep3_logCPM
## Tery_0001 " 6.4913" " 6.3164" " 6.1740"
## Tery_0002 " 2.4581" " 2.4314" " 2.6415"
## LFe_d5_rep1_logCPM LFe_d5_rep2_logCPM MFe_d5_rep1_logCPM
## Tery_0001 " 6.6415" " 6.6080" " 6.4050"
## Tery_0002 " 2.4882" " 2.4958" " 1.6114"
## MFe_d5_rep2_logCPM
## Tery_0001 " 6.2068"
## Tery_0002 " 2.3459"
cat("\n==>", as.character(Sys.time()), " writing out cpms ...\n")
##
## ==> 2024-04-24 21:02:52 writing out cpms ...
# annotate the cpm table, then write it out with annotations
rawCounts.selected <- rawTable[rownames(cpms), ]
#head(rawCounts.selected)
colnames(rawCounts.selected) <- paste(colnames(rawCounts.selected), "_RAW", sep = "")
#head(rawCounts.selected)
rawCount_cpms <- cbind(rawCounts.selected, cpms.formatted, cpms.log.formatted)
#head(rawCount_cpms)
annotations.selected <- annotations[annotations$locus_tag%in%rownames(rawCount_cpms), ]
#View(annotations.selected)
rownames(annotations.selected) <- annotations.selected$locus_tag
anno_cols <- c('locus_tag', 'geneID', 'feature', 'start', 'end', 'strand', 'seqname', 'product', 'note', 'dbxref')
annotations.selected <- annotations.selected[rownames(cpms.formatted), anno_cols]
#head(annotations.selected)
rawCount_cpms.anno <- cbind(rawCount_cpms, annotations.selected)
#head(rawCount_cpms.anno)
write.table(rawCount_cpms, file=paste(prefix, "_featureCounts_CPM.tsv", sep=""), sep="\t", row.names = T, col.names=NA)
write.table(rawCount_cpms.anno, file=paste(prefix, "_featureCounts_CPM_anno.tsv", sep=""), sep="\t", row.names = T, col.names=NA)
#pdf(file = paste(prefix, "_featureCounts_top1000_logCPM.pdf", sep=""))
pheatmap(cpms.log[1:500, ], cluster_cols = F)
#dev.off()
library(DESeq2)
library(ggplot2)
cat("\n==>", as.character(Sys.time()), " importing counts and metadata into DESeq: \n")
##
## ==> 2024-04-24 21:02:53 importing counts and metadata into DESeq:
dds <- DESeqDataSetFromMatrix(countData = counts,
colData = metadata,
design= ~ treatment)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
cat("\n==>", as.character(Sys.time()), " total read number per sample are ... \n")
##
## ==> 2024-04-24 21:02:53 total read number per sample are ...
total_read_no <- as.data.frame(colSums(assay(dds)))
colnames(total_read_no) <- c("TotalReads")
total_read_no <- cbind(total_read_no, metadata[rownames(total_read_no), ])
total_read_no$SampleName <- rownames(total_read_no)
total_read_no
## TotalReads day replicate treatment treatment_day color pch
## HFe_d5_rep1 3433852 5 1 HFe HFe_d5 forestgreen 16
## HFe_d5_rep2 6134662 5 2 HFe HFe_d5 forestgreen 16
## HFe_d5_rep3 5446224 5 3 HFe HFe_d5 forestgreen 16
## LFe_d5_rep1 3025047 5 1 LFe LFe_d5 red 16
## LFe_d5_rep2 5521660 5 2 LFe LFe_d5 red 16
## MFe_d5_rep1 5754829 5 1 MFe MFe_d5 yellow3 16
## MFe_d5_rep2 3881938 5 2 MFe MFe_d5 yellow3 16
## SampleName
## HFe_d5_rep1 HFe_d5_rep1
## HFe_d5_rep2 HFe_d5_rep2
## HFe_d5_rep3 HFe_d5_rep3
## LFe_d5_rep1 LFe_d5_rep1
## LFe_d5_rep2 LFe_d5_rep2
## MFe_d5_rep1 MFe_d5_rep1
## MFe_d5_rep2 MFe_d5_rep2
my_theme1 <- theme_bw() +
theme(panel.background = element_blank(),
axis.line = element_line(colour = "black"),
text = element_text(size = 16),
axis.title.x = element_text(size=18, color="black"),
axis.title.y = element_text(size=18, color="black"),
axis.text.x = element_text(angle = 0, hjust = 1, color="black"),
panel.grid.minor.x = element_line(colour = "grey", linewidth=0.2, linetype = 'dashed'),
panel.grid.major.x = element_line(colour = "grey", linewidth=0.2),
panel.grid.minor.y = element_line(colour = "grey", linewidth = 0.2, linetype = 'dashed'),
panel.grid.major.y = element_line(colour = "grey", linewidth=0.2),
legend.position = "bottom",
legend.text=element_text(size=10),
legend.key.size = unit(1,"line"),
plot.margin=unit(c(1,1,1,1),"cm")
)
p_reads <- ggplot(total_read_no, aes(x = SampleName, y = TotalReads, fill = factor(color))) +
geom_bar(stat="identity", width=0.5) +
geom_abline(slope=0, intercept=3e6, col="red",lty=2) +
scale_fill_manual(values = c("forestgreen", "red", "yellow3"), labels = c("HFe", "LFe", "MFe")) +
labs(y = "Total Read Number", x = "Sample") + my_theme1 +
theme(axis.text.x = element_text(angle = 60, hjust = 1, size=10), axis.text.y = element_text(size=10))
ggsave(filename=paste(prefix, "_sample_read_number_barplot.pdf", sep=""), device="pdf", width=297, height=210, units="mm" )
p_reads
cat("\n==>", as.character(Sys.time()), " running vst transformation ... \n")
##
## ==> 2024-04-24 21:02:54 running vst transformation ...
vsd <- vst(dds, blind=FALSE)
vst_count <- assay(vsd)
cat("\n==>", as.character(Sys.time()), " running rlog transformation ... \n")
##
## ==> 2024-04-24 21:02:55 running rlog transformation ...
rld <- rlog(dds, blind=FALSE)
rlog_count <- assay(rld)
cat("\n==>", as.character(Sys.time()), " vst transformed counts look like: \n")
##
## ==> 2024-04-24 21:02:56 vst transformed counts look like:
head(vst_count, 3)
## HFe_d5_rep1 HFe_d5_rep2 HFe_d5_rep3 LFe_d5_rep1 LFe_d5_rep2
## Tery_0001 8.797390 8.641845 8.508486 8.936019 8.905984
## Tery_0002 5.781632 5.770847 5.877970 5.796746 5.801250
## Tery_0006 5.781632 6.014847 6.448433 5.839421 5.533193
## MFe_d5_rep1 MFe_d5_rep2
## Tery_0001 8.724222 8.548865
## Tery_0002 5.398637 5.731818
## Tery_0006 5.398637 6.585203
cat("\n==>", as.character(Sys.time()), " rlog transformed counts look like: \n")
##
## ==> 2024-04-24 21:02:56 rlog transformed counts look like:
head(rlog_count, 3)
## HFe_d5_rep1 HFe_d5_rep2 HFe_d5_rep3 LFe_d5_rep1 LFe_d5_rep2
## Tery_0001 8.649126 8.542002 8.450973 8.744031 8.725327
## Tery_0002 4.584050 4.579917 4.652827 4.591704 4.600004
## Tery_0006 4.864530 5.017129 5.334146 4.903892 4.692005
## MFe_d5_rep1 MFe_d5_rep2
## Tery_0001 8.598987 8.480152
## Tery_0002 4.340565 4.553361
## Tery_0006 4.609619 5.410122
# write vst and rlog counts
write.table(vst_count, file=paste(prefix, "_vst_count.tsv", sep=""), sep="\t", row.names = T, col.names=NA)
write.table(rlog_count, file=paste(prefix, "_rlog_count.tsv", sep=""), sep="\t", row.names = T, col.names=NA)
# boxplot of vst and rlog transformed counts
# plot normalized count
logCount <- as.data.frame(log10(assay(dds)+1)) %>% rownames_to_column %>%
gather(key="SampleName", value="Counts", -rowname)
p_logCount <- ggplot(logCount, aes(x=SampleName, y=Counts)) +
geom_boxplot(outlier.colour="black", outlier.shape=1, outlier.size=1) +
labs(y = "log counts", x = "Sample") + theme(axis.text.x = element_text(angle = 60, hjust = 1, size=10))
vst_count.long <- as.data.frame(vst_count) %>% rownames_to_column %>%
gather(key="SampleName", value="Counts", -rowname)
p_vst <- ggplot(vst_count.long, aes(x=SampleName, y=Counts)) +
geom_boxplot(outlier.colour="black", outlier.shape=1, outlier.size=1) +
labs(y = "vst counts", x = "Sample") + theme(axis.text.x = element_text(angle = 60, hjust = 1, size=10))
rlog_count.long <- as.data.frame(rlog_count) %>% rownames_to_column %>%
gather(key="SampleName", value="Counts", -rowname)
p_rlog <- ggplot(rlog_count.long, aes(x=SampleName, y=Counts)) +
geom_boxplot(outlier.colour="black", outlier.shape=1, outlier.size=1) +
labs(y = "rlog counts", x = "Sample") + theme(axis.text.x = element_text(angle = 60, hjust = 1, size=10))
p_counts <- p_logCount + p_vst + p_rlog
ggsave(filename=paste(prefix, "_sample_vst_rlog_norm_boxplot.pdf", sep=""), device="pdf", width=297, height=210, units="mm" )
p_counts
DESeq2# use ggrepel to label the dots
library(ggrepel)
# PCA based on vsd
pcaData <- plotPCA(vsd, intgroup=c("treatment", "day"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
p_vst_pca <- ggplot(pcaData, aes(PC1, PC2)) +
geom_point(aes(color=treatment, shape=as.character(metadata$pch)), size=3) +
coord_fixed() +
ggtitle("PCA based on VST counts") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
geom_text_repel(aes(label=rownames(metadata)), size=3) +
#geom_text(aes(label=rownames(metadata), hjust=0.5, vjust=-0.4)) +
scale_color_manual(values=unique(as.character(metadata$color)[order(metadata$treatment)])) +
scale_shape_manual(values=unique(as.numeric(metadata$pch)[order(metadata$pch)])) +
theme_bw() +
theme(legend.position="bottom")
# PCA based on rlog
pcaData <- plotPCA(rld, intgroup=c("treatment", "day"), returnData=TRUE)
percentVar <- round(100 * attr(pcaData, "percentVar"))
p_rlog_pca <- ggplot(pcaData, aes(PC1, PC2)) +
geom_point(aes(color=treatment, shape=as.character(metadata$pch)), size=3) +
coord_fixed() +
ggtitle("PCA based on rlog counts") +
xlab(paste0("PC1: ",percentVar[1],"% variance")) +
ylab(paste0("PC2: ",percentVar[2],"% variance")) +
geom_text_repel(aes(label=rownames(metadata)), size=3) +
#geom_text(aes(label=rownames(metadata), hjust=0.5, vjust=-0.4)) +
scale_color_manual(values=unique(as.character(metadata$color)[order(metadata$treatment)])) +
scale_shape_manual(values=unique(as.numeric(metadata$pch)[order(metadata$pch)])) +
theme_bw() +
theme(legend.position="bottom")
p_pca <- p_vst_pca + p_rlog_pca
ggsave(filename=paste(prefix, "_sample_vst_rlog_PCA.pdf", sep=""), device="pdf", width=297, height=210, units="mm" )
p_pca
# clustering using Euclidean distance based on vst counts
vst_dist <- dist(t(vst_count))
# the agglomeration method can be "ward.D", "ward.D2", "single", "complete",
# "average" (= UPGMA), "mcquitty" (= WPGMA), "median" (= WPGMC) or "centroid" (= UPGMC)
vst_clust <- hclust(vst_dist, method="ward.D2")
vst_dend <- as.dendrogram(vst_clust, hang=0.1)
vst_dend_cols <- as.character(metadata$color[order.dendrogram(vst_dend)])
labels_colors(vst_dend) <- vst_dend_cols
# save to pdf
#pdf(file = paste(prefix, "_sample_vst_dendrogram.pdf", sep=""))
plot(vst_dend, ylab="Euclidean Distance based on VST counts")
#dev.off()
# clustering using Euclidean distance based on rlog counts
rlog_dist <- dist(t(rlog_count))
rlog_clust <- hclust(rlog_dist, method="ward.D2")
rlog_dend <- as.dendrogram(rlog_clust, hang=0.1)
rlog_dend_cols <- as.character(metadata$color[order.dendrogram(rlog_dend)])
labels_colors(rlog_dend) <- rlog_dend_cols
#pdf(file = paste(prefix, "_sample_rlog_dendrogram.pdf", sep=""))
plot(rlog_dend, ylab="Euclidean Distance based on rlog counts")
#dev.off()
PhyloSeq with vst counts# making our phyloseq object with transformed table from vst count
vst_count_phy <- otu_table(vst_count, taxa_are_rows=T)
metadata_phy <- sample_data(metadata)
vst_phyloseq <- phyloseq(vst_count_phy, metadata_phy)
vst_pcoa <- ordinate(vst_phyloseq, method="MDS", distance="euclidean")
# eigen_vals allows us to scale the axes according to their magnitude of separating apart the samples
eigen_vals <- vst_pcoa$values$Eigenvalues
p_vst_pcoa <- plot_ordination(vst_phyloseq, vst_pcoa) +
geom_point(aes(color=treatment, shape=as.character(metadata$pch)), size=3) +
labs(col="treatment") +
geom_text_repel(aes(label=rownames(metadata)), size=3) +
coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) + ggtitle("PCoA based on VST counts") +
scale_color_manual(values=unique(as.character(metadata$color)[order(metadata$treatment)])) +
scale_shape_manual(values=unique(as.numeric(metadata$pch)[order(metadata$pch)])) +
theme_bw() +
theme(legend.position="bottom")
ggsave(filename=paste(prefix, "_sample_vst_PCoA.pdf", sep=""), device="pdf", width=297, height=210, units="mm" )
# making our phyloseq object with transformed table from rlog count
rlog_count_phy <- otu_table(rlog_count, taxa_are_rows=T)
metadata_phy <- sample_data(metadata)
rlog_phyloseq <- phyloseq(rlog_count_phy, metadata_phy)
rlog_pcoa <- ordinate(rlog_phyloseq, method="MDS", distance="euclidean")
eigen_vals <- rlog_pcoa$values$Eigenvalues
p_rlog_pcoa <- plot_ordination(rlog_phyloseq, rlog_pcoa) +
geom_point(aes(color=treatment, shape=as.character(metadata$pch)), size=3) +
labs(col="treatment") +
geom_text_repel(aes(label=rownames(metadata)), size=3) +
coord_fixed(sqrt(eigen_vals[2]/eigen_vals[1])) + ggtitle("PCoA based on rlog counts") +
scale_color_manual(values=unique(as.character(metadata$color)[order(metadata$treatment)])) +
scale_shape_manual(values=unique(as.numeric(metadata$pch)[order(metadata$pch)])) +
theme_bw() +
theme(legend.position="bottom")
p_pcoa <- p_vst_pcoa + p_rlog_pcoa
ggsave(filename=paste(prefix, "_sample_vst_rlog_PCoA.pdf", sep=""), device="pdf", width=297, height=210, units="mm" )
p_pcoa
# function to write DE tables
write_DE_tables <- function(contrast, suffix_str){
contrast <- contrast[order(contrast$padj, decreasing=F),]
anno <- rawCount_cpms.anno[rownames(contrast), ]
contrast.anno <- cbind(contrast, anno)
write.table(contrast, file=paste(prefix, suffix_str, ".tsv", sep=""), sep="\t", row.names = T, col.names=NA)
write.table(contrast.anno, file=paste(prefix, suffix_str, "_anno.tsv", sep=""), sep="\t", row.names = T, col.names=NA)
write.table(contrast.anno[contrast$log2FoldChange>0, ], file=paste(prefix, suffix_str, "_anno_up.tsv", sep=""), sep="\t", row.names = T, col.names=NA)
write.table(contrast.anno[contrast$log2FoldChange<=0, ], file=paste(prefix, suffix_str, "_anno_dn.tsv", sep=""), sep="\t", row.names = T, col.names=NA)
}
# construct DESeqDataSet object
dds <- DESeqDataSetFromMatrix(countData = counts,
colData = metadata,
design= ~ treatment)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
# set the baseline treatment as the "LFe" treatment
dds$treatment <- relevel(dds$treatment, ref = "LFe")
# run deseq standard analysis
design(dds) <- ~treatment
dds.treatment <- DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
# extract size factors
size_facctors <- sizeFactors(dds.treatment)
size_facctors
## HFe_d5_rep1 HFe_d5_rep2 HFe_d5_rep3 LFe_d5_rep1 LFe_d5_rep2 MFe_d5_rep1
## 0.8004557 1.4216113 1.1968956 0.6665582 1.2081094 1.2196937
## MFe_d5_rep2
## 0.7717292
# Dispersion plot
head(dispersions(dds.treatment))
## [1] 0.02286648 0.06576208 0.12845667 0.02085939 0.03464709 0.03714544
plotDispEsts(dds.treatment, cex=0.5, ymin=1e-05)
# get per level baseMean
baseMeanPerLvl <- sapply(levels(dds$treatment),
function(lvl) rowMeans(counts(dds.treatment,normalized=TRUE)[,dds$treatment == lvl]))
colnames(baseMeanPerLvl) <- sapply(levels(dds$treatment), function(lvl) paste0("baseMean_", lvl))
head(baseMeanPerLvl)
## baseMean_LFe baseMean_HFe baseMean_MFe
## Tery_0001 449.53505 367.77561 363.64980
## Tery_0002 25.58204 26.00418 18.63108
## Tery_0006 22.19346 37.96431 38.71585
## Tery_0008 518.28818 509.83442 511.90678
## Tery_0011 184.42879 222.08368 235.84973
## Tery_0012 365.20928 373.97958 356.84321
# ------------------
# HFe vs LFe samples
# ------------------
HFe_vs_LFe_contrast <- results(dds.treatment, alpha=0.05, contrast=c("treatment", "HFe", "LFe"))
head(HFe_vs_LFe_contrast)
## log2 fold change (MLE): treatment HFe vs LFe
## Wald test p-value: treatment HFe vs LFe
## DataFrame with 6 rows and 6 columns
## baseMean log2FoldChange lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric> <numeric> <numeric>
## Tery_0001 389.9566 -0.2915776 0.209951 -1.3887889 0.164897 0.517665
## Tery_0002 23.7770 0.0243618 0.428993 0.0567885 0.954714 NA
## Tery_0006 33.6731 0.8111985 0.541987 1.4967132 0.134468 0.466698
## Tery_0008 512.8419 -0.0213121 0.199397 -0.1068829 0.914882 0.978458
## Tery_0011 215.2583 0.2596615 0.263532 0.9853132 0.324470 0.707095
## Tery_0012 366.5777 0.0405920 0.263543 0.1540241 0.877591 0.967661
# combine res with baseMeanPerLvl
HFe_vs_LFe_contrast.lvl <- cbind(HFe_vs_LFe_contrast['baseMean'],
baseMeanPerLvl,
HFe_vs_LFe_contrast[, 2:ncol(HFe_vs_LFe_contrast)])
# check the results
head(HFe_vs_LFe_contrast.lvl)
## DataFrame with 6 rows and 9 columns
## baseMean baseMean_LFe baseMean_HFe baseMean_MFe log2FoldChange
## <numeric> <numeric> <numeric> <numeric> <numeric>
## Tery_0001 389.9566 449.5351 367.7756 363.6498 -0.2915776
## Tery_0002 23.7770 25.5820 26.0042 18.6311 0.0243618
## Tery_0006 33.6731 22.1935 37.9643 38.7158 0.8111985
## Tery_0008 512.8419 518.2882 509.8344 511.9068 -0.0213121
## Tery_0011 215.2583 184.4288 222.0837 235.8497 0.2596615
## Tery_0012 366.5777 365.2093 373.9796 356.8432 0.0405920
## lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric>
## Tery_0001 0.209951 -1.3887889 0.164897 0.517665
## Tery_0002 0.428993 0.0567885 0.954714 NA
## Tery_0006 0.541987 1.4967132 0.134468 0.466698
## Tery_0008 0.199397 -0.1068829 0.914882 0.978458
## Tery_0011 0.263532 0.9853132 0.324470 0.707095
## Tery_0012 0.263543 0.1540241 0.877591 0.967661
# write
write_DE_tables(HFe_vs_LFe_contrast, "_HFe_vs_LFe_contrast")
write_DE_tables(HFe_vs_LFe_contrast.lvl, "_HFe_vs_LFe_contrast_lvl")
# plot MA
par(mar = c(4, 4, .1, .1))
plotMA(HFe_vs_LFe_contrast, ylim=c(-2,2), main="HFe_vs_LFe", colSig = "blue", cex=.8)
Filtering criteria The goal of independent filtering is to filter out those tests from the procedure that have no, or little chance of showing significant evidence, without even looking at their test statistic.
Genes with very low counts are not likely to see significant differences typically due to high dispersion. For example, we can plot the −log10(p-values) from all genes over the normalized mean counts.
plot(HFe_vs_LFe_contrast$baseMean+1, -log10(HFe_vs_LFe_contrast$pvalue), main="HFe_vs_LFe",
log="x", xlab="mean of normalized counts",
ylab=expression(-log[10](pvalue)),
ylim=c(0,30),
cex=.4, col=rgb(0,0,0,.3))
# p-value histogram
# the p value histogram below shows how the filtering ameliorates the multiple testing problem,
# by removing a background set of hypotheses whose p values are distributed more or less uniformly in [0,1].
cat("\n==>", as.character(Sys.time()), "Filtering criteria:", metadata(HFe_vs_LFe_contrast)$filterThreshold, "\n")
##
## ==> 2024-04-24 21:03:03 Filtering criteria: 33.37568
use <- HFe_vs_LFe_contrast$baseMean > metadata(HFe_vs_LFe_contrast)$filterThreshold
h1 <- hist(HFe_vs_LFe_contrast$pvalue[!use], breaks=0:50/50, plot=FALSE)
h2 <- hist(HFe_vs_LFe_contrast$pvalue[use], breaks=0:50/50, plot=FALSE)
colori <- c(`do not pass`="khaki", `pass`="powderblue")
# Histogram of p values for all tests.
# The area shaded in blue indicates the subset of those that pass the filtering, the area in khaki those that do not pass
{
barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, col = colori, space = 0, main = "HFe_vs_LFe", ylab="frequency")
text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA)
legend("topright", fill=rev(colori), legend=rev(names(colori)))
}
# ------------------
# MFe vs LFe samples
# ------------------
MFe_vs_LFe_contrast <- results(dds.treatment, alpha=0.05, contrast=c("treatment", "MFe", "LFe"))
# combine res with baseMeanPerLvl
MFe_vs_LFe_contrast.lvl <- cbind(MFe_vs_LFe_contrast['baseMean'],
baseMeanPerLvl,
MFe_vs_LFe_contrast[, 2:ncol(MFe_vs_LFe_contrast)])
# check the results
head(MFe_vs_LFe_contrast.lvl)
## DataFrame with 6 rows and 9 columns
## baseMean baseMean_LFe baseMean_HFe baseMean_MFe log2FoldChange
## <numeric> <numeric> <numeric> <numeric> <numeric>
## Tery_0001 389.9566 449.5351 367.7756 363.6498 -0.3029813
## Tery_0002 23.7770 25.5820 26.0042 18.6311 -0.4968857
## Tery_0006 33.6731 22.1935 37.9643 38.7158 0.7918668
## Tery_0008 512.8419 518.2882 509.8344 511.9068 -0.0150832
## Tery_0011 215.2583 184.4288 222.0837 235.8497 0.3572647
## Tery_0012 366.5777 365.2093 373.9796 356.8432 -0.0213793
## lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric>
## Tery_0001 0.230806 -1.3127124 0.189280 0.497956
## Tery_0002 0.491788 -1.0103657 0.312320 NA
## Tery_0006 0.592055 1.3374879 0.181063 NA
## Tery_0008 0.218807 -0.0689337 0.945042 0.982363
## Tery_0011 0.288434 1.2386348 0.215481 0.530029
## Tery_0012 0.289283 -0.0739045 0.941086 0.981672
# write
write_DE_tables(MFe_vs_LFe_contrast, "_MFe_vs_LFe_contrast")
write_DE_tables(MFe_vs_LFe_contrast, "_MFe_vs_LFe_contrast_lvl")
# plot MA
par(mar = c(4, 4, .1, .1))
plotMA(MFe_vs_LFe_contrast, ylim=c(-2,2), main="MFe_vs_LFe", colSig = "blue", cex=.8)
plot(MFe_vs_LFe_contrast$baseMean+1, -log10(MFe_vs_LFe_contrast$pvalue), main="MFe_vs_LFe",
log="x", xlab="mean of normalized counts",
ylab=expression(-log[10](pvalue)),
ylim=c(0,30),
cex=.4, col=rgb(0,0,0,.3))
# p-value histogram
# the p value histogram below shows how the filtering ameliorates the multiple testing problem,
# by removing a background set of hypotheses whose p values are distributed more or less uniformly in [0,1].
cat("\n==>", as.character(Sys.time()), "Filtering criteria:", metadata(MFe_vs_LFe_contrast)$filterThreshold, "\n")
##
## ==> 2024-04-24 21:03:04 Filtering criteria: 75.08633
use <- MFe_vs_LFe_contrast$baseMean > metadata(MFe_vs_LFe_contrast)$filterThreshold
h1 <- hist(MFe_vs_LFe_contrast$pvalue[!use], breaks=0:50/50, plot=FALSE)
h2 <- hist(MFe_vs_LFe_contrast$pvalue[use], breaks=0:50/50, plot=FALSE)
colori <- c(`do not pass`="khaki", `pass`="powderblue")
# Histogram of p values for all tests.
# The area shaded in blue indicates the subset of those that pass the filtering, the area in khaki those that do not pass
{
barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, col = colori, space = 0, main = "MFe_vs_LFe", ylab="frequency")
text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA)
legend("topright", fill=rev(colori), legend=rev(names(colori)))
}
# ------------------
# HFe vs MFe samples
# ------------------
HFe_vs_MFe_contrast <- results(dds.treatment, alpha=0.05, contrast=c("treatment", "HFe", "MFe"))
# combine res with baseMeanPerLvl
HFe_vs_MFe_contrast.lvl <- cbind(HFe_vs_MFe_contrast['baseMean'],
baseMeanPerLvl,
HFe_vs_MFe_contrast[, 2:ncol(HFe_vs_MFe_contrast)])
# check the results
head(HFe_vs_MFe_contrast.lvl)
## DataFrame with 6 rows and 9 columns
## baseMean baseMean_LFe baseMean_HFe baseMean_MFe log2FoldChange
## <numeric> <numeric> <numeric> <numeric> <numeric>
## Tery_0001 389.9566 449.5351 367.7756 363.6498 0.01140375
## Tery_0002 23.7770 25.5820 26.0042 18.6311 0.52124749
## Tery_0006 33.6731 22.1935 37.9643 38.7158 0.01933172
## Tery_0008 512.8419 518.2882 509.8344 511.9068 -0.00622894
## Tery_0011 215.2583 184.4288 222.0837 235.8497 -0.09760323
## Tery_0012 366.5777 365.2093 373.9796 356.8432 0.06197137
## lfcSE stat pvalue padj
## <numeric> <numeric> <numeric> <numeric>
## Tery_0001 0.210741 0.0541127 0.956845 0.998313
## Tery_0002 0.444496 1.1726719 0.240927 0.846590
## Tery_0006 0.518104 0.0373124 0.970236 0.998313
## Tery_0008 0.198940 -0.0313106 0.975022 0.998313
## Tery_0011 0.260008 -0.3753860 0.707373 0.979413
## Tery_0012 0.263072 0.2355677 0.813768 0.985314
# write
write_DE_tables(HFe_vs_MFe_contrast, "_HFe_vs_MFe_contrast")
write_DE_tables(HFe_vs_MFe_contrast, "_HFe_vs_MFe_contrast_lvl")
# plot MA
par(mar = c(4, 4, .1, .1))
plotMA(HFe_vs_MFe_contrast, ylim=c(-2,2), main="HFe_vs_MFe", colSig = "blue", cex=.8)
plot(HFe_vs_MFe_contrast$baseMean+1, -log10(HFe_vs_MFe_contrast$pvalue), main="HFe_vs_MFe",
log="x", xlab="mean of normalized counts",
ylab=expression(-log[10](pvalue)),
ylim=c(0,30),
cex=.4, col=rgb(0,0,0,.3))
# p-value histogram
# the p value histogram below shows how the filtering ameliorates the multiple testing problem,
# by removing a background set of hypotheses whose p values are distributed more or less uniformly in [0,1].
cat("\n==>", as.character(Sys.time()), "Filtering criteria:", metadata(HFe_vs_MFe_contrast)$filterThreshold, "\n")
##
## ==> 2024-04-24 21:03:05 Filtering criteria: 4.167553
use <- HFe_vs_MFe_contrast$baseMean > metadata(HFe_vs_MFe_contrast)$filterThreshold
h1 <- hist(HFe_vs_MFe_contrast$pvalue[!use], breaks=0:50/50, plot=FALSE)
h2 <- hist(HFe_vs_MFe_contrast$pvalue[use], breaks=0:50/50, plot=FALSE)
colori <- c(`do not pass`="khaki", `pass`="powderblue")
# Histogram of p values for all tests.
# The area shaded in blue indicates the subset of those that pass the filtering, the area in khaki those that do not pass
{
barplot(height = rbind(h1$counts, h2$counts), beside = FALSE, col = colori, space = 0, main = "HFe_vs_MFe", ylab="frequency")
text(x = c(0, length(h1$counts)), y = 0, label = paste(c(0,1)), adj = c(0.5,1.7), xpd=NA)
legend("topright", fill=rev(colori), legend=rev(names(colori)))
}
# -------
# summary
# -------
# we can get a glimpse at what this table currently holds with the summary command
summary(HFe_vs_LFe_contrast)
##
## out of 4249 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 181, 4.3%
## LFC < 0 (down) : 127, 3%
## outliers [1] : 0, 0%
## low counts [2] : 577, 14%
## (mean count < 33)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(MFe_vs_LFe_contrast)
##
## out of 4249 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 156, 3.7%
## LFC < 0 (down) : 59, 1.4%
## outliers [1] : 0, 0%
## low counts [2] : 1071, 25%
## (mean count < 75)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
summary(HFe_vs_MFe_contrast)
##
## out of 4249 with nonzero total read count
## adjusted p-value < 0.05
## LFC > 0 (up) : 33, 0.78%
## LFC < 0 (down) : 46, 1.1%
## outliers [1] : 0, 0%
## low counts [2] : 83, 2%
## (mean count < 4)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
# make a stat table based on the summary
DE_stats <- "
contrast up down
HFe_vs_LFe 181 127
HFe_vs_MFe 33 46
MFe_vs_LFe 156 59
"
de_stat_df <- read.table(text=DE_stats, header=T)
de_stat_df
## contrast up down
## 1 HFe_vs_LFe 181 127
## 2 HFe_vs_MFe 33 46
## 3 MFe_vs_LFe 156 59
mat <- de_stat_df[, c("up", "down")]
rownames(mat) <- de_stat_df$contrast
# save to pdf
#pdf(file = paste(prefix, "_DESeq2_DE_Overview.pdf", sep=""))
pheatmap(mat=mat, angle_col = 0, fontsize_col = 10, fontsize_row = 10)
#dev.off()
# get gene names
get_de_gene_names <- function(DESeq_contrast, padj_cutoff=0.01, log2FC_cutoff=1) {
library(tidyverse)
genes.sig_up <- as.data.frame(DESeq_contrast) %>%
rownames_to_column('locus_tag') %>%
filter(!is.na(padj)) %>%
filter(padj <= padj_cutoff, log2FoldChange >= log2FC_cutoff) %>%
column_to_rownames('locus_tag') %>% rownames()
genes.sig_dn <- as.data.frame(DESeq_contrast) %>%
rownames_to_column('locus_tag') %>%
filter(!is.na(padj)) %>%
filter(padj <= padj_cutoff, log2FoldChange <= -1*log2FC_cutoff) %>%
column_to_rownames('locus_tag') %>% rownames()
return(list(genes.sig_up, genes.sig_dn))
}
padj_cutoff = 0.05
logFC_cutoff = 0.585
# HFe_vs_LFe sig
HFe_vs_LFe_sig <- get_de_gene_names(HFe_vs_LFe_contrast,
padj_cutoff=padj_cutoff,
log2FC_cutoff=logFC_cutoff)
HFe_vs_LFe_sig.up <- HFe_vs_LFe_sig[[1]]
HFe_vs_LFe_sig.dn <- HFe_vs_LFe_sig[[2]]
# HFe_vs_MFe sig
HFe_vs_MFe_sig <- get_de_gene_names(HFe_vs_MFe_contrast,
padj_cutoff=padj_cutoff,
log2FC_cutoff=logFC_cutoff)
HFe_vs_MFe_sig.up <- HFe_vs_MFe_sig[[1]]
HFe_vs_MFe_sig.dn <- HFe_vs_MFe_sig[[2]]
# MFe_vs_LFe sig
MFe_vs_LFe_sig <- get_de_gene_names(MFe_vs_LFe_contrast,
padj_cutoff=padj_cutoff,
log2FC_cutoff=logFC_cutoff)
MFe_vs_LFe_sig.up <- MFe_vs_LFe_sig[[1]]
MFe_vs_LFe_sig.dn <- MFe_vs_LFe_sig[[2]]
#BiocManager::install('eulerr')
library(eulerr)
up.list <- list(
HFe_vs_LFe = HFe_vs_LFe_sig.up,
HFe_vs_MFe = HFe_vs_MFe_sig.up,
MFe_vs_LFe = MFe_vs_LFe_sig.up)
dn.list <- list(
HFe_vs_LFe = HFe_vs_LFe_sig.dn,
HFe_vs_MFe = HFe_vs_MFe_sig.dn,
MFe_vs_LFe = MFe_vs_LFe_sig.dn)
#pdf(file = paste(prefix, "_Eule_diagram_padj0.05_logFC0.585_up.pdf", sep=""))
plot(euler(up.list, shape = "ellipse"), main="Euler diagram of up regulated genes",
fills = c("limegreen", "dodgerblue", "darkgoldenrod1"),
edges = FALSE,
fontsize = 8,
quantities = TRUE,
legend=TRUE)
#dev.off()
#pdf(file = paste(prefix, "_Eule_diagram_padj0.05_logFC0.585_dn.pdf", sep=""))
plot(euler(dn.list, shape = "ellipse"), main="Euler diagram of down regulated genes",
fills = c("limegreen", "dodgerblue", "darkgoldenrod1"),
edges = FALSE,
fontsize = 8,
quantities = TRUE,
legend=TRUE)
#dev.off()
# visualizing top gene (based on adj. p-value)
topGene <- rownames(HFe_vs_LFe_contrast)[which.min(HFe_vs_LFe_contrast$padj)]
topGene
## [1] "Tery_2483"
data <- plotCounts(dds.treatment, gene=topGene, intgroup=c("treatment"), returnData = T)
data$treatment <- factor(data$treatment, levels=c("LFe", "MFe", "HFe"))
data$replicate <- factor(colData(dds.treatment)$replicate, levels=c(1, 2, 3))
ggplot(data, aes(x=treatment, y=count, fill=treatment)) +
scale_y_log10() + theme_bw() +
geom_violin(alpha=0.3) +
geom_jitter(aes(color=treatment), shape=16, position=position_jitter(0.2)) +
geom_smooth(method = "lm", se = TRUE) +
ggtitle(topGene)
## `geom_smooth()` using formula = 'y ~ x'
# boxplot of differnet groups
# specify the comparisons you want
my_comparisons <- list(c("HFe", "LFe"), c("HFe", "MFe"), c("MFe", "LFe"))
p_gene <- ggpubr::ggboxplot(data, x = "treatment", y = "count",
color = "treatment", palette = "jco",
add = "jitter") +
rotate_x_text(angle = 45) +
geom_hline(yintercept = mean(data$count), linetype=2) + # Add horizontal line at base mean
stat_compare_means(method="anova", label.y=50) + # Add global p-value
stat_compare_means(comparisons=my_comparisons, method="t.test", ref.group = "LFe") # Add pairwise comparisons p-value
p_gene
sessionInfo()
## R version 4.2.1 (2022-06-23)
## Platform: x86_64-apple-darwin17.0 (64-bit)
## Running under: macOS Big Sur ... 10.16
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## attached base packages:
## [1] stats4 splines stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] eulerr_7.0.1 ggrepel_0.9.5
## [3] ggpubr_0.6.0 lubridate_1.9.3
## [5] forcats_1.0.0 stringr_1.5.1
## [7] dplyr_1.1.4 purrr_1.0.2
## [9] readr_2.1.5 tidyr_1.3.1
## [11] tibble_3.2.1 tidyverse_2.0.0
## [13] patchwork_1.2.0 pheatmap_1.0.12
## [15] apeglm_1.18.0 ggplot2_3.5.0
## [17] phyloseq_1.42.0 dendextend_1.17.1
## [19] DESeq2_1.36.0 SummarizedExperiment_1.26.1
## [21] MatrixGenerics_1.8.1 matrixStats_1.2.0
## [23] GenomicRanges_1.48.0 GenomeInfoDb_1.32.4
## [25] IRanges_2.30.1 S4Vectors_0.34.0
## [27] NOISeq_2.40.0 Matrix_1.6-5
## [29] Biobase_2.56.0 BiocGenerics_0.42.0
## [31] edgeR_3.38.4 limma_3.52.4
## [33] BiocManager_1.30.22
##
## loaded via a namespace (and not attached):
## [1] backports_1.4.1 systemfonts_1.0.6 plyr_1.8.9
## [4] igraph_2.0.3.9008 polylabelr_0.2.0 BiocParallel_1.30.4
## [7] digest_0.6.35 foreach_1.5.2 htmltools_0.5.8
## [10] viridis_0.6.5 fansi_1.0.6 magrittr_2.0.3
## [13] memoise_2.0.1 cluster_2.1.6 tzdb_0.4.0
## [16] Biostrings_2.64.1 annotate_1.74.0 bdsmatrix_1.3-7
## [19] timechange_0.3.0 colorspace_2.1-0 blob_1.2.4
## [22] textshaping_0.3.7 xfun_0.43 crayon_1.5.2
## [25] RCurl_1.98-1.14 jsonlite_1.8.8 genefilter_1.78.0
## [28] survival_3.5-8 iterators_1.0.14 ape_5.7-1
## [31] glue_1.7.0 polyclip_1.10-6 gtable_0.3.4
## [34] zlibbioc_1.42.0 XVector_0.36.0 DelayedArray_0.22.0
## [37] car_3.1-2 Rhdf5lib_1.18.2 abind_1.4-5
## [40] scales_1.3.0 mvtnorm_1.2-4 DBI_1.2.2
## [43] rstatix_0.7.2 Rcpp_1.0.12 viridisLite_0.4.2
## [46] xtable_1.8-4 emdbook_1.3.13 bit_4.0.5
## [49] httr_1.4.7 RColorBrewer_1.1-3 farver_2.1.1
## [52] pkgconfig_2.0.3 XML_3.99-0.16.1 sass_0.4.9
## [55] locfit_1.5-9.9 utf8_1.2.4 labeling_0.4.3
## [58] tidyselect_1.2.1 rlang_1.1.3 reshape2_1.4.4
## [61] AnnotationDbi_1.58.0 munsell_0.5.0 tools_4.2.1
## [64] cachem_1.0.8 cli_3.6.2 generics_0.1.3
## [67] RSQLite_2.3.5 ade4_1.7-22 broom_1.0.5
## [70] evaluate_0.23 biomformat_1.24.0 fastmap_1.1.1
## [73] yaml_2.3.8 ragg_1.3.0 knitr_1.45
## [76] bit64_4.0.5 KEGGREST_1.36.3 nlme_3.1-164
## [79] compiler_4.2.1 rstudioapi_0.16.0 png_0.1-8
## [82] ggsignif_0.6.4 geneplotter_1.74.0 bslib_0.6.2
## [85] stringi_1.8.3 highr_0.10 lattice_0.22-6
## [88] ggsci_3.0.3 vegan_2.6-4 permute_0.9-7
## [91] multtest_2.52.0 vctrs_0.6.5 pillar_1.9.0
## [94] lifecycle_1.0.4 rhdf5filters_1.8.0 jquerylib_0.1.4
## [97] data.table_1.15.2 bitops_1.0-7 R6_2.5.1
## [100] gridExtra_2.3 codetools_0.2-19 MASS_7.3-60.0.1
## [103] rhdf5_2.40.0 withr_3.0.0 GenomeInfoDbData_1.2.8
## [106] mgcv_1.9-1 parallel_4.2.1 hms_1.1.3
## [109] grid_4.2.1 coda_0.19-4.1 rmarkdown_2.26
## [112] carData_3.0-5 bbmle_1.0.25.1 numDeriv_2016.8-1.1